Russia War Losses Dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mn
from sklearn.impute import KNNImputer
# For plotting
import plotly
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Took to map in notebook
import geopandas as gpd
from geopandas.tools import geocode
import folium
from scipy.stats import linregress
from branca.element import Figure
import zipfile
import datetime as dt
# For prediction and ML
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
!kaggle datasets download -d "piterfm/2022-ukraine-russian-war"
Downloading 2022-ukraine-russian-war.zip to C:\Users\Aadith Sukumar\Desktop\E-Learn\SIT\Second Year\Sem 3\Project\Russia-Ukraine War
0%| | 0.00/9.05k [00:00<?, ?B/s] 100%|##########| 9.05k/9.05k [00:00<00:00, 4.10MB/s]
# Unzipping the dataset downloaded
zipfile.ZipFile('2022-ukraine-russian-war.zip').extractall()
equip_losses=pd.read_csv("./russia_losses_equipment.csv")
person_losses=pd.read_csv("./russia_losses_personnel.csv")
equip_losses.head()
| date | day | aircraft | helicopter | tank | APC | field artillery | MRL | military auto | fuel tank | drone | naval ship | anti-aircraft warfare | special equipment | mobile SRBM system | greatest losses direction | vehicles and fuel tanks | cruise missiles | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022-02-25 | 2 | 10 | 7 | 80 | 516 | 49 | 4 | 100.0 | 60.0 | 0 | 2 | 0 | NaN | NaN | NaN | NaN | NaN |
| 1 | 2022-02-26 | 3 | 27 | 26 | 146 | 706 | 49 | 4 | 130.0 | 60.0 | 2 | 2 | 0 | NaN | NaN | NaN | NaN | NaN |
| 2 | 2022-02-27 | 4 | 27 | 26 | 150 | 706 | 50 | 4 | 130.0 | 60.0 | 2 | 2 | 0 | NaN | NaN | NaN | NaN | NaN |
| 3 | 2022-02-28 | 5 | 29 | 29 | 150 | 816 | 74 | 21 | 291.0 | 60.0 | 3 | 2 | 5 | NaN | NaN | NaN | NaN | NaN |
| 4 | 2022-03-01 | 6 | 29 | 29 | 198 | 846 | 77 | 24 | 305.0 | 60.0 | 3 | 2 | 7 | NaN | NaN | NaN | NaN | NaN |
person_losses.head()
| date | day | personnel | personnel* | POW | |
|---|---|---|---|---|---|
| 0 | 2022-02-25 | 2 | 2800 | about | 0.0 |
| 1 | 2022-02-26 | 3 | 4300 | about | 0.0 |
| 2 | 2022-02-27 | 4 | 4500 | about | 0.0 |
| 3 | 2022-02-28 | 5 | 5300 | about | 0.0 |
| 4 | 2022-03-01 | 6 | 5710 | about | 200.0 |
equip_losses.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 257 entries, 0 to 256 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 257 non-null object 1 day 257 non-null int64 2 aircraft 257 non-null int64 3 helicopter 257 non-null int64 4 tank 257 non-null int64 5 APC 257 non-null int64 6 field artillery 257 non-null int64 7 MRL 257 non-null int64 8 military auto 65 non-null float64 9 fuel tank 65 non-null float64 10 drone 257 non-null int64 11 naval ship 257 non-null int64 12 anti-aircraft warfare 257 non-null int64 13 special equipment 238 non-null float64 14 mobile SRBM system 36 non-null float64 15 greatest losses direction 181 non-null object 16 vehicles and fuel tanks 192 non-null float64 17 cruise missiles 192 non-null float64 dtypes: float64(6), int64(10), object(2) memory usage: 36.3+ KB
equip_losses.describe()
| day | aircraft | helicopter | tank | APC | field artillery | MRL | military auto | fuel tank | drone | naval ship | anti-aircraft warfare | special equipment | mobile SRBM system | vehicles and fuel tanks | cruise missiles | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 257.000000 | 257.000000 | 257.000000 | 257.000000 | 257.000000 | 257.00000 | 257.000000 | 65.000000 | 65.000000 | 257.000000 | 257.000000 | 257.000000 | 238.000000 | 36.000000 | 192.000000 | 192.000000 |
| mean | 130.000000 | 202.540856 | 177.696498 | 1533.268482 | 3491.813230 | 836.18677 | 226.782101 | 1047.507692 | 69.323077 | 616.863813 | 12.140078 | 111.599222 | 76.760504 | 3.944444 | 3022.583333 | 193.989583 |
| std | 74.333707 | 61.332464 | 49.397792 | 706.934421 | 1313.876455 | 467.34927 | 97.472909 | 466.162060 | 7.545917 | 405.187265 | 4.297054 | 51.071095 | 42.980191 | 0.333333 | 674.695195 | 80.852072 |
| min | 2.000000 | 10.000000 | 7.000000 | 80.000000 | 516.000000 | 49.00000 | 4.000000 | 100.000000 | 60.000000 | 0.000000 | 2.000000 | 0.000000 | 10.000000 | 2.000000 | 1796.000000 | 84.000000 |
| 25% | 66.000000 | 190.000000 | 155.000000 | 1008.000000 | 2445.000000 | 436.00000 | 151.000000 | 600.000000 | 60.000000 | 232.000000 | 8.000000 | 77.000000 | 42.000000 | 4.000000 | 2508.750000 | 129.000000 |
| 50% | 130.000000 | 217.000000 | 187.000000 | 1584.000000 | 3744.000000 | 801.00000 | 246.000000 | 1178.000000 | 73.000000 | 654.000000 | 15.000000 | 105.000000 | 66.500000 | 4.000000 | 2962.000000 | 182.000000 |
| 75% | 194.000000 | 236.000000 | 206.000000 | 2068.000000 | 4459.000000 | 1157.00000 | 294.000000 | 1437.000000 | 76.000000 | 867.000000 | 15.000000 | 156.000000 | 115.500000 | 4.000000 | 3615.000000 | 239.250000 |
| max | 258.000000 | 278.000000 | 260.000000 | 2786.000000 | 5654.000000 | 1791.00000 | 391.000000 | 1701.000000 | 76.000000 | 1476.000000 | 16.000000 | 203.000000 | 159.000000 | 4.000000 | 4216.000000 | 399.000000 |
print("Size =",equip_losses.size)
print("Shape =",equip_losses.shape)
Size = 4626 Shape = (257, 18)
person_losses.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 257 entries, 0 to 256 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 257 non-null object 1 day 257 non-null int64 2 personnel 257 non-null int64 3 personnel* 257 non-null object 4 POW 62 non-null float64 dtypes: float64(1), int64(2), object(2) memory usage: 10.2+ KB
person_losses.describe()
| day | personnel | POW | |
|---|---|---|---|
| count | 257.000000 | 257.000000 | 62.000000 |
| mean | 130.000000 | 37564.225681 | 386.387097 |
| std | 74.333707 | 17782.889253 | 131.440363 |
| min | 2.000000 | 2800.000000 | 0.000000 |
| 25% | 66.000000 | 23200.000000 | 389.000000 |
| 50% | 130.000000 | 35970.000000 | 421.000000 |
| 75% | 194.000000 | 49800.000000 | 474.500000 |
| max | 258.000000 | 77170.000000 | 496.000000 |
print("Size =",person_losses.size)
print("Shape =",person_losses.shape)
Size = 1285 Shape = (257, 5)
mn.matrix(equip_losses, figsize=(20,5))
<AxesSubplot:>
equip_null = pd.DataFrame((equip_losses.isnull().sum())*100/equip_losses.shape[0]).reset_index()
equip_null.columns = ['Column Name', 'Null Values Percentage']
fig = plt.figure(figsize=(10,5))
ax = sns.pointplot(x="Column Name",y="Null Values Percentage",data=equip_null,color='blue')
plt.xticks(rotation =90,fontsize =7)
ax.axhline(50, ls='--',color='red')
plt.title("Percentage of NULL values in equipment losses data")
plt.ylabel("Null Values (%)")
plt.xlabel("Columns", )
plt.show()
fig,ax = plt.subplots(figsize=(15, 3))
equip_null=equip_null.set_index(equip_null['Column Name'])
sns.heatmap(pd.DataFrame(equip_null["Null Values Percentage"]).transpose(),cmap = 'coolwarm')
plt.show()
mn.matrix(person_losses, figsize=(15,2))
<AxesSubplot:>
person_null = pd.DataFrame((person_losses.isnull().sum())*100/person_losses.shape[0]).reset_index()
person_null.columns = ['Column Name', 'Null Values Percentage']
fig = plt.figure(figsize=(10,5))
ax = sns.pointplot(x="Column Name",y="Null Values Percentage",data=person_null,color='blue')
plt.xticks(rotation =90,fontsize =7)
ax.axhline(50, ls='--',color='red')
plt.title("Percentage of NULL values in personnel losses data")
plt.ylabel("Null Values (%)")
plt.xlabel("Columns", )
plt.show()
equip_null = pd.DataFrame((equip_losses.isnull().sum())*100/equip_losses.shape[0]).reset_index()
equip_null.columns = ['Column Name', 'Null Values Percentage']
equip_null
| Column Name | Null Values Percentage | |
|---|---|---|
| 0 | date | 0.000000 |
| 1 | day | 0.000000 |
| 2 | aircraft | 0.000000 |
| 3 | helicopter | 0.000000 |
| 4 | tank | 0.000000 |
| 5 | APC | 0.000000 |
| 6 | field artillery | 0.000000 |
| 7 | MRL | 0.000000 |
| 8 | military auto | 74.708171 |
| 9 | fuel tank | 74.708171 |
| 10 | drone | 0.000000 |
| 11 | naval ship | 0.000000 |
| 12 | anti-aircraft warfare | 0.000000 |
| 13 | special equipment | 7.392996 |
| 14 | mobile SRBM system | 85.992218 |
| 15 | greatest losses direction | 29.571984 |
| 16 | vehicles and fuel tanks | 25.291829 |
| 17 | cruise missiles | 25.291829 |
equip_losses.isnull().sum()
date 0 day 0 aircraft 0 helicopter 0 tank 0 APC 0 field artillery 0 MRL 0 military auto 192 fuel tank 192 drone 0 naval ship 0 anti-aircraft warfare 0 special equipment 19 mobile SRBM system 221 greatest losses direction 76 vehicles and fuel tanks 65 cruise missiles 65 dtype: int64
# Dropping columns "mobile SRBM system" and "military auto", "fuel tank" from equipment dataset
equip_losses = equip_losses.drop(['mobile SRBM system','military auto','fuel tank'],axis=1)
person_losses = person_losses.drop(['personnel*'],axis=1)
special_equipment = equip_losses['special equipment']
equip_losses.info()
equip_null
<class 'pandas.core.frame.DataFrame'> RangeIndex: 257 entries, 0 to 256 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 257 non-null object 1 day 257 non-null int64 2 aircraft 257 non-null int64 3 helicopter 257 non-null int64 4 tank 257 non-null int64 5 APC 257 non-null int64 6 field artillery 257 non-null int64 7 MRL 257 non-null int64 8 drone 257 non-null int64 9 naval ship 257 non-null int64 10 anti-aircraft warfare 257 non-null int64 11 special equipment 238 non-null float64 12 greatest losses direction 181 non-null object 13 vehicles and fuel tanks 192 non-null float64 14 cruise missiles 192 non-null float64 dtypes: float64(3), int64(10), object(2) memory usage: 30.2+ KB
| Column Name | Null Values Percentage | |
|---|---|---|
| 0 | date | 0.000000 |
| 1 | day | 0.000000 |
| 2 | aircraft | 0.000000 |
| 3 | helicopter | 0.000000 |
| 4 | tank | 0.000000 |
| 5 | APC | 0.000000 |
| 6 | field artillery | 0.000000 |
| 7 | MRL | 0.000000 |
| 8 | military auto | 74.708171 |
| 9 | fuel tank | 74.708171 |
| 10 | drone | 0.000000 |
| 11 | naval ship | 0.000000 |
| 12 | anti-aircraft warfare | 0.000000 |
| 13 | special equipment | 7.392996 |
| 14 | mobile SRBM system | 85.992218 |
| 15 | greatest losses direction | 29.571984 |
| 16 | vehicles and fuel tanks | 25.291829 |
| 17 | cruise missiles | 25.291829 |
x, y = person_losses['date'], person_losses['personnel']
# Base of figure
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y, mode='markers',line_color="red"))
# Adding additional details
fig.update_layout(title="Russian Personnel Losses", xaxis_title="Date", yaxis_title="Count", margin=dict(l=0, r=0, t=50, b=50))
fig.show()
# Adding a time refernce list to match the size of dataframe
pred_list=[]
for i in range(person_losses.shape[0]):
pred_list.append(i)
pred_list
# Adding to dataframe
person_losses["time_ref"]=pred_list
person_losses
| date | day | personnel | POW | time_ref | |
|---|---|---|---|---|---|
| 0 | 2022-02-25 | 2 | 2800 | 0.0 | 0 |
| 1 | 2022-02-26 | 3 | 4300 | 0.0 | 1 |
| 2 | 2022-02-27 | 4 | 4500 | 0.0 | 2 |
| 3 | 2022-02-28 | 5 | 5300 | 0.0 | 3 |
| 4 | 2022-03-01 | 6 | 5710 | 200.0 | 4 |
| ... | ... | ... | ... | ... | ... |
| 252 | 2022-11-04 | 254 | 74840 | NaN | 252 |
| 253 | 2022-11-05 | 255 | 75440 | NaN | 253 |
| 254 | 2022-11-06 | 256 | 75930 | NaN | 254 |
| 255 | 2022-11-07 | 257 | 76460 | NaN | 255 |
| 256 | 2022-11-08 | 258 | 77170 | NaN | 256 |
257 rows × 5 columns
# Predicting the number of personnel losses for the next 5 years
X = person_losses['time_ref'].values.reshape(-1,1)
y = person_losses['personnel'].values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
# Adding a longer range of time refernce to extend prediction into the future
pred_list=[]
for i in range(305):
pred_list.append(i)
pred_list = np.array(pred_list).reshape(-1,1)
pred = regressor.predict(pred_list)
# Plotting both the actual and predicted values on the same graph
fig = go.Figure()
fig.add_trace(go.Scatter(x=pred_list.reshape(-1), y=pred.reshape(-1), mode='lines',line_color="red",name="Predicted"))
fig.add_trace(go.Scatter(x=X_test.reshape(-1), y=y_test.reshape(-1), mode='markers',line_color="blue",name="Actual"))
fig.update_layout(title="Predicted vs Actual Russian Personnel Losses", legend=dict(x=0, y=1), xaxis_title="Date", yaxis_title="Count", margin=dict(l=0, r=0, t=50, b=50))
fig.show()
# Calculating the accuracy of model
print("Accuracy of the model =",round(regressor.score(X_test,y_test)*100,4),"%")
Accuracy of the model = 96.631 %
x = equip_losses['date']
y0 = equip_losses['aircraft']
y1 = equip_losses['helicopter']
y2 = equip_losses['anti-aircraft warfare']
y3 = equip_losses['drone']
y4 = equip_losses['tank']
y5 = equip_losses['APC']
y6 = equip_losses['naval ship']
y7 = equip_losses['field artillery']
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y0, mode='lines+markers', name='Aircraft'))
fig.add_trace(go.Scatter(x=x, y=y1, mode='lines+markers', name='Helicopter'))
fig.add_trace(go.Scatter(x=x, y=y2, mode='lines+markers', name='Anti-aircraft warfare'))
fig.add_trace(go.Scatter(x=x, y=y3, mode='lines+markers', name='Drone'))
fig.add_trace(go.Scatter(x=x, y=y4, mode='lines+markers', name='Tank'))
fig.add_trace(go.Scatter(x=x, y=y5, mode='lines+markers', name='APC'))
fig.add_trace(go.Scatter(x=x, y=y6, mode='lines+markers', name='Naval Ship'))
fig.add_trace(go.Scatter(x=x, y=y7, mode='lines+markers', name='Field Artillery'))
fig.update_layout(legend_orientation="h", legend=dict(x=0, y=1), title="Russian Equipment Losses", xaxis_title="Date", yaxis_title="Count", margin=dict(l=0, r=0, t=30, b=0))
fig.show()
location_loss = equip_losses['greatest losses direction']
location_loss
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
...
252 Lyman and Avdiivka
253 Lyman and Avdiivka
254 Lyman and Avdiivka
255 Lyman, Bakhmut and Avdiivka
256 Bakhmut and Avdiivka
Name: greatest losses direction, Length: 257, dtype: object
# Splitting the names according to duplicacy
location_loss = location_loss.dropna()
location_list = location_loss[(~location_loss.str.contains('and|,'))].unique().tolist()
location_list
['Sievierodonetsk', 'Izyum', 'Popasna', 'Slobozhanskyi', 'Novopavlivsk', 'Avdiivka', 'Kurakhove', 'Bakhmut', 'Lyman', 'Zaporizhzhia', 'Sloviansk', 'Kramatorsk', 'Mykolaiv', 'Kryvyi Rih', 'Donetsk']
# Filtering out data into a new dataframe
equip_losses2 = equip_losses[['date','greatest losses direction']]
equip_losses2 = equip_losses2.dropna()
equip_losses2['date'] = pd.to_datetime(equip_losses2['date'])
equip_losses2.replace(" and",",",regex=True, inplace=True)
# Converting object string into a list
equip_losses2['listlosses'] = "[1,2,3]"
j=0
for i in equip_losses2['greatest losses direction']:
list_losses=[]
list_losses.append(i)
equip_losses2.iloc[j,2] = list_losses
j+=1
equip_losses2.drop('greatest losses direction', axis=1, inplace=True)
equip_losses2['listlosses'] = equip_losses2['listlosses'].astype("string")
# equip_losses2.explode('listlosses')
equip_losses2.head(20)
| date | listlosses | |
|---|---|---|
| 59 | 2022-04-25 | ['Sievierodonetsk'] |
| 60 | 2022-04-26 | ['Sievierodonetsk'] |
| 61 | 2022-04-27 | ['Kurakhove, Izyum'] |
| 62 | 2022-04-28 | ['Zaporizhzhia, Izyum'] |
| 63 | 2022-04-29 | ['Izyum'] |
| 64 | 2022-04-30 | ['Izyum'] |
| 65 | 2022-05-01 | ['Izyum'] |
| 66 | 2022-05-02 | ['Izyum'] |
| 67 | 2022-05-03 | ['Izyum'] |
| 68 | 2022-05-04 | ['Izyum, Novopavlivsk'] |
| 69 | 2022-05-05 | ['Popasna'] |
| 70 | 2022-05-06 | ['Lyman, Kurakhove'] |
| 71 | 2022-05-07 | ['Slobozhanskyi'] |
| 72 | 2022-05-08 | ['Novopavlivsk'] |
| 73 | 2022-05-09 | ['Avdiivka'] |
| 74 | 2022-05-10 | ['Avdiivka'] |
| 75 | 2022-05-11 | ['Novopavlivsk, Kurakhove, Sievierodonetsk'] |
| 76 | 2022-05-12 | ['Kurakhove'] |
| 77 | 2022-05-13 | ['Kurakhove'] |
| 78 | 2022-05-14 | ['Kurakhove'] |
# Adding location counts
loc_city = []
loc_count = []
for i in location_list:
loc_city.append(i)
loc_count.append(location_loss[location_loss.str.contains(str(i))].count())
loc_uk = pd.DataFrame(loc_city, columns =['City'])
loc_uk['Count'] = loc_count
loc_uk = loc_uk.sort_values('Count',ascending=False)
loc_uk
| City | Count | |
|---|---|---|
| 7 | Bakhmut | 63 |
| 14 | Donetsk | 62 |
| 11 | Kramatorsk | 32 |
| 13 | Kryvyi Rih | 25 |
| 5 | Avdiivka | 22 |
| 6 | Kurakhove | 14 |
| 10 | Sloviansk | 12 |
| 8 | Lyman | 10 |
| 9 | Zaporizhzhia | 10 |
| 1 | Izyum | 8 |
| 0 | Sievierodonetsk | 7 |
| 12 | Mykolaiv | 4 |
| 4 | Novopavlivsk | 3 |
| 2 | Popasna | 1 |
| 3 | Slobozhanskyi | 1 |
# Adding Co-ordinates for mapping
latitude = [48.002777,
48.59441,
48.738968,
47.910483,
47.9833,
48.136596,
48.8667,
47.837800,
49.209999,
48.94831944,
46.975033,
55.5333,
46.679443,
48.628242,
50.18932]
longitude = [37.805279,
37.99983,
37.584351,
33.391783,
37.2667,
37.7491335,
37.6167,
35.138300,
37.260746,
38.49166111,
31.994583,
28.65,
29.974368,
38.372715,
36.42414]
loc_uk['Latitude']=latitude
loc_uk['Longitude']=longitude
loc_uk
| City | Count | Latitude | Longitude | |
|---|---|---|---|---|
| 7 | Bakhmut | 63 | 48.002777 | 37.805279 |
| 14 | Donetsk | 62 | 48.594410 | 37.999830 |
| 11 | Kramatorsk | 32 | 48.738968 | 37.584351 |
| 13 | Kryvyi Rih | 25 | 47.910483 | 33.391783 |
| 5 | Avdiivka | 22 | 47.983300 | 37.266700 |
| 6 | Kurakhove | 14 | 48.136596 | 37.749133 |
| 10 | Sloviansk | 12 | 48.866700 | 37.616700 |
| 8 | Lyman | 10 | 47.837800 | 35.138300 |
| 9 | Zaporizhzhia | 10 | 49.209999 | 37.260746 |
| 1 | Izyum | 8 | 48.948319 | 38.491661 |
| 0 | Sievierodonetsk | 7 | 46.975033 | 31.994583 |
| 12 | Mykolaiv | 4 | 55.533300 | 28.650000 |
| 4 | Novopavlivsk | 3 | 46.679443 | 29.974368 |
| 2 | Popasna | 1 | 48.628242 | 38.372715 |
| 3 | Slobozhanskyi | 1 | 50.189320 | 36.424140 |
# Importing map from folio
fig=Figure(height=550,width=850)
uk_map = folium.Map(location=[48.5, 32.5], zoom_start=5.5, tiles='OpenStreetMap', max_zoom=7, min_zoom=4.5)
for x,y,z,w in zip(loc_uk.Latitude, loc_uk.Longitude, loc_uk.City, loc_uk.Count):
#print(x,y)
folium.Marker(location=[x,y],popup=w,tooltip=z,icon=folium.Icon(color='white',icon='bomb',prefix='fa',icon_color='black')).add_to(uk_map)
fig.add_child(uk_map)
uk_map
“The war is a big disaster, and this disaster has a high price. With every meaning of this word. People lose money, reputation, quality of life, they lose freedom. But the main thing is that people lose their loved ones, they lose themselves.”
- Volodymyr Zelensky, President of Ukraine
Hopefully the war will too